####
# BOX PLOTS OF EVICTED VS CONTROL GROUPS
####


###packages
rm(list=ls())
gc()
library(data.table,lib.loc="/apps/R/lib")
library(dplyr,lib.loc="/apps/R/lib")
library(dtplyr,lib.loc="/apps/R/lib")
library(Hmisc,lib.loc="/apps/R/lib")
library(stringr,lib.loc="/apps/R/lib")
library(stargazer,lib.loc="/apps/R/lib")
library(locfit,lib.loc="/apps/R/lib")
library(RColorBrewer,lib.loc="/apps/R/lib")
library(grid,lib.loc="/apps/R/lib")
library(gtable,lib.loc="/apps/R/lib")
library(sandwich,lib.loc="/apps/R/lib")
library(parallel,lib.loc="/apps/R/lib")
library(haven,lib.loc="/apps/R/lib")
library(lubridate,lib.loc="/apps/R/lib")
library(bit64,lib.loc="/apps/R/lib")

## hepler functions
comb <- function(...) parse(text=paste0(...))

###reading in relevant datasets
#controls & characs
ctrldt <- fread("/build/evic_acs_all_characs_ctrls.csv")
#link
link <- fread("/build/cases_w_pik_all_cases_no_dups_allvars.csv")
#eviction data
evic <- fread("/build/evic_data_ready_with_instruments.csv")

#outcome datasets
lehd <- setDT(read_dta("/build/iv_fin_lehd.dta"))

fixpik <- function(dt) {
  dt[!is.na(as.numeric(pik)), temppik := sprintf("%09d",as.numeric(pik)) ]
  dt[, pik := NULL]
  dt[, pik := temppik]
  dt[, temppik := NULL]
}

#applying to all datatables
lapply(list(ctrldt,link,hmis,lehd), fixpik)

###prepping datasets

#fix tract
ctrldt[is.na(tract), tract := sprintf("%06d",as.numeric(tract))]

#adding ctrldt to link
drops = names(link)[!(names(link) %in% c("unique_id","pik"))]
keeps = names(ctrldt)[!(names(ctrldt) %in% drops)]
ctrldt = merge(link,ctrldt, by = c("unique_id","pik"), all.x = T, all.y = F)

#adding eviction data
drops = names(ctrldt)[!(names(ctrldt) %in% c("unique_id"))]
keeps = names(evic)[!(names(evic) %in% drops)]
ctrldt = merge(ctrldt,unique(evic[,keeps, with = F]), by = "unique_id",all.x = T, all.y =T)

#building some more controls 
ctrldt[, ':=' (age = as.numeric(Case_Year) - as.numeric(dob),
               fem_black = black == T & female == T,
               fem_white = white == T & female == T)]
ctrldt[, ':=' (age_sq = age^2,
               age_cub = age^3)]
ctrldt[, case_year_fac := as.factor(case_year)]

##ADD WEIGHTS AND ADD CTRLS
#function to drop any vars in dt that are shared with ctrldt and merge in ctrldt to a given dataset dt, also add miss vars
add_ctrls <- function(dt, mergeby = "unique_id") {
  drops = names(ctrldt)[!(names(ctrldt) %in% mergeby)]
  keeps = names(dt)[!(names(dt) %in% drops)]
  
  dttemp = dt[,keeps, with = F]
  #adding indicator for being in the dataset
  dttemp[, in_data := T]
  
  dttemp = unique(dttemp)
  dttemp <- merge(dttemp,ctrldt, by = mergeby, all.x  =T, all.y = F)
  
  #now add miss vars
  dttemp[, ':=' (age_miss = is.na(age),
                 fem_miss = is.na(female),
                 black_miss = is.na(black))]
  
  #now replace missing with means
  dttemp[, ':=' (age = ifelse(is.na(age),mean(age,na.rm = T), age),
                 female = ifelse(is.na(female), mean(female,na.rm = T), female),
                 black = ifelse(is.na(black), mean(black, na.rm = T), black))]
  
  ##now adding weights by tract to the acs sample
  #get acs tracts
  acstract = dttemp[in_data == T & (acs_placebo == T | (acs_placebo == F & in_acs_and_evic == T)),
                    .(acs_count = sum(acs_pwgt), acs_count_obs = .N), by = tract]
  evictract = dttemp[in_data == T & acs_placebo == F, .(evic_count = .N), by = tract]
  
  acstract <- merge(evictract,acstract, by = "tract",all.x = T, all.y = T)
  
  #now get weighted.. its evicted/acs counts (not accounting for acs_pwgts since not in this data set.)
  acstract[acs_count > 0, tract_wgt := evic_count/acs_count] # ratio of evic counts to weighted acs counts
  acstract[is.na(tract_wgt), tract_wgt := 0]
  
  acstract[acs_count_obs > 0, tract_wgt_old := evic_count/acs_count_obs]
  acstract[is.na(tract_wgt_old), tract_wgt_old := 0]
  #merge into dttemp
  dttemp <- merge(dttemp,acstract[,.(tract,tract_wgt, tract_wgt_old)], by = "tract",all.x = T, all.y = F)
  
  #add wgt of 1 for evicted
  dttemp[acs_placebo == F, tract_wgt := 1]
  dttemp[acs_placebo == T, tract_wgt := tract_wgt * acs_pwgt]
  return(dttemp)  
}


##building a dataset of just unique_id and weights
acstract = ctrldt[(acs_placebo == T | (acs_placebo == F & in_acs_and_evic == T)), 
                  .(acs_count = sum(acs_pwgt)), by = tract]
evictract = ctrldt[acs_placebo == F, .(evic_count = .N), by = tract]

acstract <- merge(evictract,acstract, by = "tract",all.x = T, all.y = T)

#now get weighted.. its evicted/acs counts
acstract[acs_count > 0, tract_wgt := evic_count/acs_count]
acstract[is.na(tract_wgt), tract_wgt := 0]

#merge into dttemp
uidtract <- merge(ctrldt,acstract[,.(tract,tract_wgt)], by = "tract",all.x = T, all.y = F)

#add wgt of 1 for evicted
uidtract[acs_placebo == T, tract_wgt := tract_wgt * acs_pwgt]
uidtract[acs_placebo == F, tract_wgt := 1]

uidtract <- unique(uidtract[,.(unique_id,tract_wgt,tract)])
setnames(uidtract, "tract","tract_for_wgt")
#save uidtract
fwrite(uidtract, "/raw/uniqueid_tractwt_cwk.csv")
write_dta(uidtract, "/raw/uniqueid_tractwt_cwk.dta")

###MAIN FUNCTION
#dt is a data.table that contains all the variables below:
#samp1 is a string that defines sample 1 (ie "evicted == 1 & iv_samp == T")
#samp2 is a string that defines sample 2 similar to samp1
#ctrls is a vector of controls
#indep_name is what you want to call the independent var of interest
boxplot_dt <- function(dt,samp1,samp2,samp1_name,samp2_name,ctrls =NA,ctrl_name,outs,state_count = F, state_count_time = NA,weight_var = NA) {
  dt[, temp_ind := NULL]
  
  #make sure samp1 and samp2 partition their union
  samptest = paste0("(",samp1,") & (",samp2,")")
  if (dt[eval(comb(samptest)),.N] > 0) {
    stop("FAILED TO CONTINUE: samp1 and samp2 have a nontrivial intersection, that should not happen!")
  }
  
  #make sure ctrls are in dt..
  if (!(all(ctrls %in% names(dt))) & !is.na(ctrls)[1]) {
    stop("FAILED TO CONTINUE: some of the ctrls are not in dt :(")
  }
  
  temp = dt[eval(comb(samp1)) | eval(comb(samp2)),]
  
  temp[eval(comb(samp1)), temp_ind := T]
  temp[eval(comb(samp2)), temp_ind := F]
  
  if(any(temp[,.N, by = temp_ind]$N == 0)){
    stop("one samp has 0 observations")
  }
  
  ##now running regression
  
  #setting up controls
  if (is.na(ctrls)[1]) {
    lhs = "temp_ind"
  } else {
    lhs = paste(c("temp_ind",ctrls), collapse = " + ")
  }
  
  #see if have weights
  if (is.na(weight_var)) {
    have_weights = F
  } else {
    have_weights = temp[,min(get(weight_var),na.rm = T) != max(get(weight_var),na.rm = T)]
  }

  #initiate a dt to hold results
  out_dt = data.table()
  count_dt = data.table()
  
  ##STARTING LOOP FOR OUTCOMES
  for (out in outs) {
    
    print(out)
    
    if(!(out %in% names(temp))) {
      print(paste0("VARIABLE: ",out," does not exist in the dataset, moving to next"))
      next
    }
    
    #making Nan,-Inf and others NA
    temp[is.na(get(out)) | is.infinite(get(out)) | is.nan(get(out)), c(out) := NA]
    
    strctrls = paste(ctrls, collapse = ") & !is.na(")
    strctrls = paste0("!is.na(",strctrls,")")
    
    form = paste(out," ~ ",lhs)
    
    #putting a check if no data...
    if(any(temp[eval(comb(strctrls)),all(is.na(get(out))), by = temp_ind]$V1 == T)){
      print(paste("IGNORING SPEC: ",form,", no data available"))
      warning(paste("IGNORING SPEC: ",form,", no data available"))
      next
    }
    
    form = as.formula(form)
    
    #use na.exclude to pad NAs on residuals.. changes nothing result wise otherwise
    if (is.na(weight_var)) {
      #running regression
      lm1 = lm(form,data = temp,na.action = na.exclude) 
    } else {
      temp[, temp_wgt := NULL]
      temp[, temp_wgt := get(weight_var)]
      lm1 = lm(form,data = temp,na.action = na.exclude, weights = temp_wgt)
    }
    
    lmdt = as.data.table(summary(lm1)$coefficients, keep.rownames = T)
    
    #add residuals... a count is a non missing residual because we used na.exclude
    temp[, temp_resid := residuals(lm1)]
    
    #full nobs in regression
    nobs = temp[!is.na(temp_resid),.N]
    
    #to get counts,need to binarize out if cts, or otherwise leave
    isbinary = temp[!is.na(get(out)), length(unique(get(out))) == 2]
    print(paste0("isbinary value is:",isbinary))
    
    #binarizing
    temp[, bin_out := NULL]
    
    if(isbinary) {
      temp[, bin_out := get(out)]
    } else {
      temp[, bin_out := "is a continuous variable"]
    }
    
    #now that we have residuals, we also can take group means conditional on appearing in the data used for the regression
    #need to weight by weight_var if exists
    #first element is for temp_resid == T
    if (is.na(weight_var)) {
      groupmeans = c(temp[!is.na(temp_resid) & temp_ind== T, mean(get(out),na.rm = T)],
                     temp[!is.na(temp_resid) & temp_ind == F, mean(get(out),na.rm = T)])
    } else {
      #weighted
      groupmeans = c(temp[!is.na(temp_resid) & temp_ind == T, mean(rep(get(out),temp_wgt),na.rm = T)],
                     temp[!is.na(temp_resid) & temp_ind == F, mean(rep(get(out),temp_wgt),na.rm = T)])
    }
    
    #get state count
    if (state_count == T){
      counterdt = temp[!is.na(temp_resid), .(num_states = 0), by = c("temp_ind","bin_out")]
      
      #13 states for count
      for (t in 1:13) {
        countvar = paste0("state_ind",t,"_",state_count_time)
        countertemp = temp[!is.na(temp_resid), .(state_temp = any(get(countvar),na.rm = T)), by = c("temp_ind","bin_out")]
        countertemp[is.na(state_temp), state_temp := 0]
        
        #updating count
        counterdt = merge(counterdt,countertemp, by = c("temp_ind","bin_out"), all.x = T, all.y = F)
        counterdt[, num_states := num_states + state_temp]
        counterdt[, state_temp := NULL]
      }
      
      #get counts by temp_ind and bin_out
      tempcount = temp[!is.na(temp_resid), 
                       .(count = .N), by = c("temp_ind","bin_out")]
      
      #add counterdt
      tempcount <- merge(tempcount,counterdt, by = c("temp_ind","bin_out"),all.x = T)
      
      
    } else {
      state_count_val = NA
      
      #get counts by temp_ind and bin_out
      tempcount = temp[!is.na(temp_resid), 
                       .(count = length(unique(pik)),
                         num_states = state_count_val), by = c("temp_ind","bin_out")]
    }
    
    #we finally want to rename each category similar to the xls file
    tempcount[,category := paste0("OLS w ctrls=",ctrl_name,", ",out,"=",bin_out,",",samp1_name,"=",temp_ind,",",samp2_name,"=",!temp_ind)]
    tempcount = tempcount[, .(category,count,num_states)]

    #now back to getting dt for regression output
    tempout = lmdt[rn == "temp_indTRUE",]
    tempout[, rn := NULL]
    setnames(tempout,c("est_samp1_minus_samp2","std_error","tval","pval"))
    tempout[, outcome := out]
    tempout[, controls := ctrl_name]
    tempout[, samp1 := samp1_name]
    tempout[, samp2 := samp2_name]
    #adding group means
    tempout[, samp1_group_mean := groupmeans[1]]
    tempout[, samp2_group_mean := groupmeans[2]]
    #adding weight indicator
    tempout[, weighted := have_weights]
    tempout[, N := nobs]

    #now rbind
    out_dt <- rbindlist(list(out_dt,tempout),fill = T)
    count_dt <- rbindlist(list(count_dt,tempcount), fill = T)
  }
  
  #return out_dt
  return(list(out_dt,count_dt))
}

###FUNCTION TO IMPLEMENT THIS ON A LIST OF CONTROLS SAMPLE NAMES, ETC
##ALSO BUILDING STATE COUNTS IF WEIGHT_VAR PROVIDED
apply_boxplot <- function(dt,samples,samp_names,controls,control_names,outs,time_ints,state_count = F, state_count_time = NA, weight_var = NA) {
  
  #intiating
  masterdt_reg = data.table()
  masterdt_count = data.table()
  
  for (i in 1:length(samples)) {
    
    samp1 = samples[[i]][1]
    samp2 = samples[[i]][2]
    
    samp1_name = samp_names[[i]][1]
    samp2_name = samp_names[[i]][2]
    
    print(paste(samp1_name,samp2_name))
    
    for (i in 1:length(controls)) {
      
      ctr = controls[[i]]
      ctrl_name = control_names[i]
      print(ctrl_name)
      
      for (time in time_ints) {
        print(time)
        outtimes = paste0(outs,time)
        
        
        if(is.na(state_count_time)) {
          state_count_time = time
        }
        temp = boxplot_dt(dt = dt, samp1 = samp1, samp2 = samp2, samp1_name = samp1_name, samp2_name = samp2_name, 
                          ctrls = ctr, ctrl_name = ctrl_name, outs = outtimes, state_count = state_count, state_count_time = state_count_time, weight_var = weight_var)
        tempreg = temp[[1]]
        tempcount = temp[[2]]
        
        masterdt_reg <- rbindlist(list(masterdt_reg,tempreg), fill = T)
        masterdt_count <- rbindlist(list(masterdt_count,tempcount), fill = T)
      }
    }
  }
  return(list(masterdt_reg,masterdt_count))
}

###USING FUNCTION ON DATASETS
#for saving..
outpath = "../Inputs/"

##DEFINING SAMPLES
#samp_1 is evicted vs acs that is either not in court or in evicted sample but not evicted
#samp_2 is evicted vs not evicted
samples = list(c("evicted == 1 & acs_placebo == 0","(acs_placebo == 1) | (in_acs_and_evic == 1 & evicted == 0)"),
               c("evicted == 1 & acs_placebo == 0","acs_placebo == 0  & evicted == 0"))

samp_names = list(c("evicted","acs_renter"),
                  c("evicted","not_evicted"))


##DEFINING CONTROLS
control = c("age","age_sq","age_cub","female","black","white","fem_black","fem_white",
            "age_miss","fem_miss","black_miss","case_year_fac",
            "marf_rollyr_5_all_hh_pov_rate", "marf_rollyr_5_rent_hh_median_ren", "marf_rollyr_5_all_hh_median_inco")

controls = list(NA,
                control)

control_names = c("no_ctrl","all_ctrl")

##DEFINING WEIGHT VAR
weight_var = "tract_wgt"

##getting mob dt ready
lehd_ready = add_ctrls(dt = lehd, mergeby = "unique_id")

##UPDATED LEHD CONTROLS

control = c("age","age_sq","age_cub","female","black","white","fem_black","fem_white",
            "age_miss","fem_miss","black_miss","case_year_fac",
            "marf_rollyr_5_all_hh_pov_rate", "marf_rollyr_5_rent_hh_median_ren", "marf_rollyr_5_all_hh_median_inco")
earn_controls = unlist(strsplit("earn_b1 earn_b2 earn_b3 earn_b4 earn_b1_miss earn_b2_miss earn_b3_miss earn_b4_miss earn_b8to1 earn_b8to1_miss", " "))
emp_controls = unlist(strsplit("anyearn_allst_b1 anyearn_allst_b2 anyearn_allst_b3 anyearn_allst_b4 anyearn_allst_b1_miss anyearn_allst_b2_miss anyearn_allst_b3_miss anyearn_allst_b4_miss anyearn_allst_b8to1 anyearn_allst_b8to1_miss", " "))

allctrls = c(control,earn_controls,emp_controls)
somectrls = c(control)
controls = list(NA, allctrls, somectrls)
control_names = c("no_ctrl","all_ctrl", "some_ctrls")

##BUILDING MISS VARS
for (out in grep("_miss",allctrls,value = T)) {
  if (out %in% names(lehd_ready)) {
    next
  }
  
  og_missvar = gsub("_miss","",out)
  
  if (!(og_missvar %in% names(lehd_ready))) {
    print(paste0(og_missvar," is not in lehd_ready"))
    next
  }
  
  lehd_ready[, c(out) := is.na(get(og_missvar))]
  lehd_ready[, c(og_missvar) := ifelse(is.na(get(og_missvar)), mean(get(og_missvar),na.rm = T), get(og_missvar))]
  
}


##DEFINING OUTCOMES

lehdtimes = c("1_8")
lehdouts = c("earn_","anyearn_allst_")

##RUNNING FUN
lehd_box = apply_boxplot(dt = lehd_ready,samples=samples,samp_names=samp_names,
                         controls=controls,control_names = control_names,outs = lehdouts,time_ints = lehdtimes,state_count = T, state_count_time = NA, weight_var = weight_var)

lehd_results = lehd_box[[1]]
lehd_results <- lehd_results[,.(outcome,controls,samp1,samp2,
                              samp1_group_mean = round(samp1_group_mean,4),
                              samp2_group_mean = round(samp2_group_mean,4),
                              est_samp1_minus_samp2 = round(est_samp1_minus_samp2,4),
                              std_error = round(std_error,4),
                              pval = round(pval,4),
                              N= round(N, 4))]

lehd_counts = lehd_box[[2]]

#save results
fwrite(lehd_results, paste0(outpath,"/Cook_barchart_regs.csv"))
